1 SALIVA-COVID-19 PCA-EFA-UPSTM

1.0.1 Loading the libraries

library("FRESA.CAD")
library(psych)
library(whitening)
library("vioplot")

library(readxl)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Data: The COVID_19 Data-Set

The data to process is described in:

https://zenodo.org/record/4156647#.Y1bSF3bMKUk

Thermal Saliva Testing Dataset

10.5281/zenodo.4156647 https://doi.org/10.5281/zenodo.4156647

I added a column to the data identifying the repeated experiments.


SalivaThermal <- as.data.frame(read_excel("~/GitHub/FCA/Data/SalivaThermal_Source_Data_2.xlsx"))


SalivaThermal_set1 <- subset(SalivaThermal,RepID==1)
rownames(SalivaThermal_set1) <- SalivaThermal_set1$ID
SalivaThermal_set1$RepID <- NULL
SalivaThermal_set1$ID <- NULL
SalivaThermal_set1$Ct <- NULL

SalivaThermal_set2 <- subset(SalivaThermal,RepID==2)
rownames(SalivaThermal_set2) <- SalivaThermal_set2$ID
SalivaThermal_set2$RepID <- NULL
SalivaThermal_set2$ID <- NULL
SalivaThermal_set2$Ct <- NULL

SalivaThermal_set3 <- subset(SalivaThermal,RepID==3)
rownames(SalivaThermal_set3) <- SalivaThermal_set3$ID
SalivaThermal_set3$RepID <- NULL
SalivaThermal_set3$ID <- NULL
SalivaThermal_set3$Ct <- NULL

SalivaThermal_Avg <- (SalivaThermal_set1 + SalivaThermal_set2 + SalivaThermal_set3)/3

colnames(SalivaThermal_Avg) <- paste("V",colnames(SalivaThermal_Avg),sep="_")

SalivaThermal_Avg$class <- 1*(str_detect(rownames(SalivaThermal_Avg),"P"))

1.1.0.1 Training and testing sets

dataframe <- SalivaThermal_Avg
outcome <- "class"

trainFraction <- 0.5
rhoThreshold <- 0.8
TopVariables <- 5
aucTHR <- 0.55

set.seed(10)
trainSample <- sample(nrow(dataframe),nrow(dataframe)*trainFraction)

trainDataFrame <- dataframe[trainSample,]
testDataFrame <- dataframe[-trainSample,]

1.1.1 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
61 251
pander::pander(table(dataframe[,outcome]))
0 1
30 31
pander::pander(table(trainDataFrame[,outcome]))
0 1
15 15
pander::pander(table(testDataFrame[,outcome]))
0 1
15 16

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

1.2 Univariate


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","IDI","ROCAUC","cStatCorr")
univar <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               trainDataFrame,
              rankingTest = "CStat")

100 : V_1102 200 : V_902



#univar$orderframe[1:5,univariate_columns]
univarTest <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               trainDataFrame,
               testData=testDataFrame,
              rankingTest = "CStat")

100 : V_1102 200 : V_902


univar$orderframe$BACC <- (univar$orderframe$Sensitivity + univar$orderframe$Specificity)/2.0
univarTest$orderframe$BACC <- (univarTest$orderframe$Sensitivity + univarTest$orderframe$Specificity)/2.0

#pROC::roc(trainDataFrame$class,trainDataFrame[,univar$orderframe$Name[1]],direction=">",plot=TRUE,auc=TRUE,quiet = TRUE)

1.3 Decorrelation with UPSTM Blind

DEdataframe <- IDeA(trainDataFrame,thr=rhoThreshold)
predTestDe <- predictDecorrelate(DEdataframe,testDataFrame)

ltvar <- getLatentCoefficients(DEdataframe);
pander::pander(head(ltvar))
  • La_V_1300:

    V_1300 V_1286 V_1246
    1 -1.34 0.301
  • La_V_1298:

    V_1300 V_1298 V_1286 V_1246
    -0.835 1 -0.152 -0.0117
  • La_V_1296:

    V_1300 V_1296 V_1292 V_1246
    -0.471 1 -0.528 -0.00107
  • La_V_1294:

    V_1300 V_1296 V_1294 V_1292 V_1246
    0.105 -0.702 1 -0.404 0.00143
  • La_V_1292:

    V_1300 V_1298 V_1296 V_1292 V_1286 V_1246
    1.3 -3.54 2.7 -0.425 -0.0358 0.00424
  • La_V_1290:

    V_1300 V_1296 V_1294 V_1292 V_1290 V_1246
    0.435 -3.43 6.14 -4.14 1 0.00293
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
5.08
pander::pander(c(Latent=length(ltvar)))
Latent
250


varlistDe <-  colnames(DEdataframe)[colnames(DEdataframe) != outcome];
univarDe <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DEdataframe,
              rankingTest = "CStat")

100 : La_V_1102 200 : La_V_902


univarDeTest <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DEdataframe,
              testData=predTestDe,
              rankingTest = "CStat")

100 : La_V_1102 200 : La_V_902


univarDe$orderframe$BACC <- (univarDe$orderframe$Sensitivity + univarDe$orderframe$Specificity)/2.0
univarDeTest$orderframe$BACC <- (univarDeTest$orderframe$Sensitivity + univarDeTest$orderframe$Specificity)/2.0

#univarDe$orderframe[1:5,univariate_columns]
#univarDeTest$orderframe[1:5,univariate_columns]

#pROC::roc(DEdataframe$class,DEdataframe[,univarDe$orderframe$Name[1]],direction=">",plot=TRUE,auc=TRUE,quiet = TRUE)

1.4 Decorrelation with UPSTM Blind/Spearman

DEdataframeSpear <- IDeA(trainDataFrame,thr=rhoThreshold,method="spearman")
predTestDeSpear <- predictDecorrelate(DEdataframeSpear,testDataFrame)

ltvar <- getLatentCoefficients(DEdataframeSpear);
pander::pander(head(ltvar))
  • La_V_1300:

    V_1300 V_1298 V_1296
    1 -2.08 1.08
  • La_V_1296:

    V_1298 V_1296
    -0.998 1
  • La_V_1294:

    V_1298 V_1294 V_1292
    -0.316 1 -0.684
  • La_V_1292:

    V_1298 V_1292 V_1286
    -0.474 1 -0.524
  • La_V_1290:

    V_1300 V_1298 V_1296 V_1292 V_1290 V_1286
    -0.451 1.04 -0.486 -0.889 1 -0.217
  • La_V_1288:

    V_1298 V_1292 V_1290 V_1288
    -0.139 1.55 -2.41 1
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
5.11
pander::pander(c(Latent=length(ltvar)))
Latent
250


varlistDeSpear <-  colnames(DEdataframeSpear)[colnames(DEdataframeSpear) != outcome];
univarDeSpear <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DEdataframeSpear,
              rankingTest = "CStat")

100 : La_V_1102 200 : La_V_902


univarDeSpearTest <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DEdataframeSpear,
              testData=predTestDeSpear,
              rankingTest = "CStat")

100 : La_V_1102 200 : La_V_902


univarDeSpear$orderframe$BACC <- (univarDeSpear$orderframe$Sensitivity + univarDeSpear$orderframe$Specificity)/2.0
univarDeSpearTest$orderframe$BACC <- (univarDeSpearTest$orderframe$Sensitivity + univarDeSpearTest$orderframe$Specificity)/2.0

1.5 Decorrelation with UPSTM Driven


DriDEdataframe <- IDeA(trainDataFrame,Outcome=outcome,thr=rhoThreshold)
predTestDri <- predictDecorrelate(DriDEdataframe,testDataFrame)


ltvar <- getLatentCoefficients(DriDEdataframe);
pander::pander(head(ltvar))
  • La_V_1300:

    V_1300 V_1286 V_1246
    1 -1.34 0.301
  • La_V_1298:

    V_1300 V_1298 V_1286 V_1246
    -0.835 1 -0.152 -0.0117
  • La_V_1296:

    V_1300 V_1296 V_1292 V_1246
    -0.471 1 -0.528 -0.00107
  • La_V_1294:

    V_1300 V_1296 V_1294 V_1292 V_1246
    0.105 -0.702 1 -0.404 0.00143
  • La_V_1292:

    V_1300 V_1298 V_1296 V_1292 V_1286 V_1246
    1.3 -3.54 2.7 -0.425 -0.0358 0.00424
  • La_V_1290:

    V_1300 V_1296 V_1294 V_1292 V_1290 V_1246
    0.435 -3.43 6.14 -4.14 1 0.00293
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
5.08
pander::pander(c(Latent=length(ltvar)))
Latent
250


varlistDe <-  colnames(DriDEdataframe)[colnames(DriDEdataframe) != outcome];
univarDeDri <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframe,
              rankingTest = "CStat")

100 : La_V_1102 200 : La_V_902


univarDeDriTest <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframe,
              testData=predTestDri,
              rankingTest = "CStat")

100 : La_V_1102 200 : La_V_902


univarDeDri$orderframe$BACC <- (univarDeDri$orderframe$Sensitivity + univarDeDri$orderframe$Specificity)/2.0
univarDeDriTest$orderframe$BACC <- (univarDeDriTest$orderframe$Sensitivity + univarDeDriTest$orderframe$Specificity)/2.0

1.6 Decorrelation with UPSTM Driven and Spearman


DriDEdataframeSpear <- IDeA(trainDataFrame,Outcome=outcome,thr=rhoThreshold,method="spearman")
predTestDriSpear <- predictDecorrelate(DriDEdataframeSpear,testDataFrame)


ltvar <- getLatentCoefficients(DriDEdataframeSpear);
pander::pander(head(ltvar))
  • La_V_1300:

    V_1300 V_1298 V_1296
    1 -2.08 1.08
  • La_V_1296:

    V_1298 V_1296
    -0.998 1
  • La_V_1294:

    V_1298 V_1294 V_1292
    -0.316 1 -0.684
  • La_V_1292:

    V_1298 V_1292 V_1286
    -0.474 1 -0.524
  • La_V_1290:

    V_1300 V_1298 V_1296 V_1292 V_1290 V_1286
    -0.451 1.04 -0.486 -0.889 1 -0.217
  • La_V_1288:

    V_1298 V_1292 V_1290 V_1288
    -0.139 1.55 -2.41 1
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
5.11
pander::pander(c(Latent=length(ltvar)))
Latent
250


varlistDeSpear <-  colnames(DriDEdataframeSpear)[colnames(DriDEdataframeSpear) != outcome];
univarDeDriSpear <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframeSpear,
              rankingTest = "CStat")

100 : La_V_1102 200 : La_V_902


univarDeDriSpearTest <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframeSpear,
              testData=predTestDriSpear,
              rankingTest = "CStat")

100 : La_V_1102 200 : La_V_902


univarDeDriSpear$orderframe$BACC <- (univarDeDriSpear$orderframe$Sensitivity + univarDeDriSpear$orderframe$Specificity)/2.0
univarDeDriSpearTest$orderframe$BACC <- (univarDeDriSpearTest$orderframe$Sensitivity + univarDeDriSpearTest$orderframe$Specificity)/2.0

1.6.1 Get continous correlated features

iscontinous <- sapply(apply(trainDataFrame,2,unique),length) > 5 ## Only variables with enough samples

noclassData <- trainDataFrame[,iscontinous]
cmat <- cor(noclassData);
diag(cmat) <- 0;
maxcor <- apply(cmat,2,max);
topcor <- names(maxcor[maxcor>rhoThreshold]) ## Only correlated features will be PCA
pander::pander(c(Ncor=length(topcor)))
Ncor
251
cmat <- NULL

notcorr <- colnames(trainDataFrame)[!(colnames(trainDataFrame) %in% topcor)]
noclassData <- noclassData[,topcor]
noclassTestData <- testDataFrame[,topcor]

1.7 PCA Analysis


### PCA 

pc <- principal(noclassData,4*TopVariables,rotate="varimax")   #principal components
pander::pander(t(pc$loadings[1:TopVariables,1:TopVariables]))
  V_1300 V_1298 V_1296 V_1294 V_1292
RC3 0.8415 0.84110 0.84058 0.83999 0.83942
RC2 0.5276 0.52886 0.53026 0.53170 0.53308
RC1 0.1007 0.09950 0.09840 0.09735 0.09632
RC5 0.0274 0.02578 0.02421 0.02263 0.02110
RC4 -0.0061 -0.00472 -0.00365 -0.00279 -0.00204
PCA_Train <- as.data.frame(cbind(predict(pc,noclassData),trainDataFrame[,notcorr]))
colnames(PCA_Train) <- c(colnames(predict(pc,noclassData)),notcorr)

PCA_Predicted <- as.data.frame(cbind(predict(pc,noclassTestData),testDataFrame[,notcorr]))
colnames(PCA_Predicted) <- c(colnames(predict(pc,noclassTestData)),notcorr)

iscontinous <- colnames(PCA_Predicted)[sapply(apply(PCA_Predicted,2,unique),length) > 5] ## Only variables with enough samples
varlistPCA <-  iscontinous;

univarPCA <- uniRankVar(varlistPCA,
              paste(outcome,"~1"),
                outcome,
              PCA_Train,
              rankingTest = "CStat")

univarPCATest <- uniRankVar(varlistPCA,
              paste(outcome,"~1"),
                outcome,
              PCA_Train,
              testData=PCA_Predicted,
              rankingTest = "CStat")

univarPCA$orderframe$BACC <- (univarPCA$orderframe$Sensitivity + univarPCA$orderframe$Specificity)/2.0
univarPCATest$orderframe$BACC <- (univarPCATest$orderframe$Sensitivity + univarPCATest$orderframe$Specificity)/2.0

1.8 EFA


uls <- fa(noclassData,4*TopVariables,rotate="varimax")  #unweighted least squares is minres 
pander::pander(t(uls$weights[1:TopVariables,1:TopVariables])) 
  V_1300 V_1298 V_1296 V_1294 V_1292
MR3 0.84149 0.84112 0.84060 0.84001 0.83944
MR2 0.52779 0.52903 0.53043 0.53187 0.53325
MR1 0.09909 0.09793 0.09683 0.09577 0.09474
MR4 0.02806 0.02638 0.02474 0.02309 0.02150
MR5 -0.00623 -0.00486 -0.00379 -0.00293 -0.00218
EFA_Train <- as.data.frame(cbind(predict(uls,noclassData),trainDataFrame[,notcorr]))
colnames(EFA_Train) <- c(colnames(predict(uls,noclassData)),notcorr)
EFA_Predicted <- as.data.frame(cbind(predict(uls,noclassTestData),testDataFrame[,notcorr]))
colnames(EFA_Predicted) <- c(colnames(predict(uls,noclassTestData)),notcorr)

iscontinous <- colnames(EFA_Predicted)[sapply(apply(EFA_Predicted,2,unique),length) > 5] ## Only variables with enough 
varlistEFA <-  iscontinous
univarEFA <- uniRankVar(varlistEFA,
              paste(outcome,"~1"),
                outcome,
              EFA_Train,
              rankingTest = "CStat")

univarEFATest <- uniRankVar(varlistEFA,
              paste(outcome,"~1"),
                outcome,
              EFA_Train,
              testData=EFA_Predicted,
              rankingTest = "CStat")

univarEFA$orderframe$BACC <- (univarEFA$orderframe$Sensitivity + univarEFA$orderframe$Specificity)/2.0
univarEFATest$orderframe$BACC <- (univarEFATest$orderframe$Sensitivity + univarEFATest$orderframe$Specificity)/2.0

1.9 White

WhiteMat = whiteningMatrix(cov(noclassData), method="PCA")
sum(is.na(WhiteMat))

[1] 26104

tokeep <- apply(is.na(WhiteMat),1,sum) == 0
WhiteMat <- WhiteMat[tokeep,]
sum(is.na(WhiteMat))

[1] 0

sum(apply(abs(WhiteMat),1,sum) > 1.0e6)

[1] 118

tokeep <- apply(abs(WhiteMat),1,sum) < 1.0e6
WhiteMat <- WhiteMat[tokeep,]
sum(apply(abs(WhiteMat),1,sum) > 1.0e6)

[1] 0


pander::pander(c(ncol=ncol(WhiteMat),nrow=nrow(WhiteMat)))
ncol nrow
251 29

pander::pander(WhiteMat[1:TopVariables,1:TopVariables]) 
  V_1300 V_1298 V_1296 V_1294 V_1292
L1 0.0258 0.0257 0.0257 0.0256 0.0256
L2 0.2072 0.2050 0.2026 0.2000 0.1976
L3 0.3876 0.3914 0.3948 0.3980 0.4012
L4 0.4058 0.3814 0.3563 0.3303 0.3048
L5 0.6179 0.5499 0.4930 0.4447 0.4019
PCAWhite_Train <- as.data.frame(cbind(tcrossprod(as.matrix(noclassData), WhiteMat),trainDataFrame[,notcorr]))
colnames(PCAWhite_Train) <- c(colnames(tcrossprod(as.matrix(noclassData), WhiteMat)),notcorr)

sum(is.na(PCAWhite_Train))

[1] 0




PCAWhitePredicted <- as.data.frame(cbind(tcrossprod(as.matrix(noclassTestData), WhiteMat),testDataFrame[,notcorr]))
colnames(PCAWhitePredicted) <- c(colnames(tcrossprod(as.matrix(noclassTestData), WhiteMat)),notcorr)

sum(is.na(PCAWhitePredicted))

[1] 0


iscontinous <- colnames(PCAWhitePredicted)[sapply(apply(PCAWhitePredicted,2,unique),length) > 5] ## Only variables with enough 
varlistWhite <-  iscontinous

univarWhite <- uniRankVar(varlistWhite,
              paste(outcome,"~1"),
                outcome,
              PCAWhite_Train,
              rankingTest = "CStat")


univarWhiteTest <- uniRankVar(varlistWhite,
              paste(outcome,"~1"),
                outcome,
              PCAWhite_Train,
              testData=PCAWhitePredicted,
              rankingTest = "CStat")

univarWhite$orderframe$BACC <- (univarWhite$orderframe$Sensitivity + univarWhite$orderframe$Specificity)/2.0
univarWhiteTest$orderframe$BACC <- (univarWhiteTest$orderframe$Sensitivity + univarWhiteTest$orderframe$Specificity)/2.0

1.10 Correlation Matrices

1.10.1 RAW

par(cex=1.0,cex.main=0.8)
breaks <- c(0:5)/5.0;

cormat <- cor(testDataFrame,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
pander::pander(max(abs(cormat)))

1

pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.997 0.998 0.999 1 1
pander::pander(c(Raw_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
Raw_fraction
0.988

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Raw Correlation",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature"
                  )


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Raw Correlation",xlab="Spearman Correlation")

rawDen <- density(cormat,from=-1,to=1)
par(op)

1.10.2 UPSTM Blind

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DEdataframe,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.799
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.456 0.539 0.603 0.672 0.777
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0

## Test Correlation
cormat <- cor(predTestDe,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.978
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.455 0.539 0.613 0.704 0.862
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.0029

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after IDeA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after UPSTM",xlab="Spearman Correlation")

DeDen <- density(cormat,from=-1,to=1)


par(op)

1.10.3 UPSTM Blind/Spearman

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DEdataframeSpear,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.952
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.501 0.595 0.675 0.753 0.879
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.00485

## Test Correlation
cormat <- cor(predTestDeSpear,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.976
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.473 0.562 0.634 0.72 0.859
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.00334

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after IDeA:Spearman",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after UPSTM",xlab="Spearman Correlation")

DeSpearDen <- density(cormat,from=-1,to=1)

par(op)

1.10.4 UPSTM Driven

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DriDEdataframe,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.799
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.456 0.539 0.603 0.672 0.777
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0

## Test Correlation
cormat <- cor(DriDEdataframe,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.915
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.418 0.498 0.563 0.639 0.788
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.000661

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after Driven-IDeA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")

#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after Driven-UPSTM",xlab="Spearman Correlation")

DeDrivDen <- density(cormat,from=-1,to=1)
par(op)

1.10.5 UPSTM Spearman

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DriDEdataframeSpear,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.952
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.501 0.595 0.675 0.753 0.879
pander::pander(c(IDeAS_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeAS_fraction
0.00485

## Test Correlation

cormat <- cor(predTestDriSpear,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.976
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.473 0.562 0.634 0.72 0.859
pander::pander(c(IDeAS_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeAS_fraction
0.00334

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation: Driven/Spearman",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after UPSTM with Spearman",xlab="Spearman Correlation")

DeDrivSpearDen <- density(cormat,from=-1,to=1)
par(op)

1.10.6 PCA

par(cex=1.0,cex.main=0.8)



## Train Correlation

cormat <- cor(PCA_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
0.999
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.995 0.997 0.998 0.999 0.999
pander::pander(c(PCA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCA_fraction
0.694

## Test Correlation
cormat <- cor(PCA_Predicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.999
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.992 0.995 0.996 0.998 0.999
pander::pander(c(PCA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCA_fraction
0.689

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after PCA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after PCA",xlab="Spearman Correlation")

PCADen <- density(cormat,from=-1,to=1)

par(op)

1.10.7 EFA

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(EFA_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
1
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
1 1 1 1 1
pander::pander(c(EFA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
EFA_fraction
0.776

## Test Correlation
cormat <- cor(EFA_Predicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
1
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
1 1 1 1 1
pander::pander(c(EFA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
EFA_fraction
0.776

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after EFA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after EFA",xlab="Spearman Correlation")

EFADen <- density(cormat,from=-1,to=1)
par(op)

1.10.8 PCA Whitening



## Train Correlation

cormat <- cor(PCAWhite_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
0.46
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
9.48e-11 0.0465 0.108 0.301 0.46
pander::pander(c(PCAWhite_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCAWhite_fraction
0

## Test Correlation
cormat <- cor(PCAWhitePredicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.716
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.462 0.538 0.57 0.653 0.716
pander::pander(c(PCAWhite_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCAWhite_fraction
0

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after PCAWhite",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after PCAWhite",xlab="Spearman Correlation")

WhiteDen <- density(cormat,from=-1,to=1)
par(op)

1.11 The Density Plot

par(cex=0.7)
colors=c("red","blue","green","darkblue","darkgreen","purple","orange","darkred");

plot(rawDen,
     xlim=c(-1,1),
     ylim=c(0.001,7.0),
     col=colors[1],
     lty=1,
     lwd=4,
     log="y",
     main="Test: Correlation Distribution",xlab="Spearman Correlation")

lines(DeDen,col=colors[2],lty=2,lwd=4)
lines(DeSpearDen,col=colors[3],lty=3,lwd=4)
lines(DeDrivDen,col=colors[4],lty=4,lwd=2)
lines(DeDrivSpearDen,col=colors[5],lty=5,lwd=2)

lines(PCADen,col=colors[6],lty=6,lwd=1)
lines(EFADen,col=colors[7],lty=7,lwd=1)
lines(WhiteDen,col=colors[8],lty=8,lwd=1)

names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
#colors=c("red","blue","green","blue","green","purple","purple","gray");
lines=c(1,2,3,4,5,6,7,8)
lwds=c(4,4,4,2,2,1,1,1)

legend("topleft",names,col=colors,lty=lines,lwd=lwds,cex=0.50)

par(op)

1.11.1 Differences between train and test ROC AUC

par(op)
par(mfrow=c(1,1),cex=0.7)

AUCResults <- list();
diffAUC <- list();
thenames <- rownames(univar$orderframe)[(rownames(univar$orderframe) %in% colnames(testDataFrame))]
rawAUC <- univar$orderframe[thenames,"ROCAUC"]
thenames <- thenames[rawAUC >= aucTHR]
rawAUC <- univar$orderframe[thenames,"ROCAUC"]
rawAUCTest <- univarTest$orderframe[thenames,"ROCAUC"]
AUCResults$RAW_T <- rawAUCTest
diffAUC$RAW <-  rawAUCTest-rawAUC
plot(rawAUC,rawAUCTest-rawAUC,
     xlab="TRAIN ROC AUC",
     ylab="Test:AUC-Train:AUC",
     xlim=c(0.5,1.0),
     ylim=c(-0.25,0.25),
     pch=1,
     col=colors[1],
     main="ROC AUC Difference Between Test and Train")

thenames <- rownames(univarDe$orderframe)[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAP <- univarDe$orderframe[thenames,"ROCAUC"]
thenames <- thenames[IDeAP >= aucTHR]
IDeAP <- univarDe$orderframe[thenames,"ROCAUC"]
IDeAPTest <- univarDeTest$orderframe[thenames,"ROCAUC"]
AUCResults$IDeAP <- IDeAP
AUCResults$IDeAP_T <- IDeAPTest
diffAUC$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

thenames <- rownames(univarDeSpear$orderframe)[!(rownames(univarDeSpear$orderframe) %in% colnames(testDataFrame))]
IDeAS <- univarDeSpear$orderframe[thenames,"ROCAUC"]
thenames <- thenames[IDeAS >= aucTHR]
IDeAS <- univarDeSpear$orderframe[thenames,"ROCAUC"]
IDeASTest <- univarDeSpearTest$orderframe[thenames,"ROCAUC"]
AUCResults$IDeAS <- IDeAS
AUCResults$IDeAS_T <- IDeASTest
diffAUC$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

thenames <- rownames(univarDeDri$orderframe)[!(rownames(univarDeDri$orderframe) %in% colnames(testDataFrame))]
DIDeAP <- univarDeDri$orderframe[thenames,"ROCAUC"]
thenames <- thenames[DIDeAP >= aucTHR]
DIDeAP <- univarDeDri$orderframe[thenames,"ROCAUC"]
DIDeAPTest <- univarDeDriTest$orderframe[thenames,"ROCAUC"]
AUCResults$DIDeAP <- DIDeAP
AUCResults$DIDeAP_T <- DIDeAPTest
diffAUC$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

thenames <- rownames(univarDeDriSpear$orderframe)[!(rownames(univarDeDriSpear$orderframe) %in% colnames(testDataFrame))]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"ROCAUC"]
thenames <- thenames[DIDeAS >= aucTHR]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"ROCAUC"]
DIDeASTest <- univarDeDriSpearTest$orderframe[thenames,"ROCAUC"]
AUCResults$DIDeAS <- DIDeAS
AUCResults$DIDeAS_T <- DIDeASTest
diffAUC$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

thenames <- rownames(univarPCA$orderframe)[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCA <- univarPCA$orderframe[thenames,"ROCAUC"]
thenames <- thenames[PCA >= aucTHR]
PCA <- univarPCA$orderframe[thenames,"ROCAUC"]
PCATest <- univarPCATest$orderframe[thenames,"ROCAUC"]
AUCResults$PCA <- PCA
AUCResults$PCA_T <- PCATest
diffAUC$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

thenames <- rownames(univarEFA$orderframe)[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFA <- univarEFA$orderframe[thenames,"ROCAUC"]
thenames <- thenames[EFA >= aucTHR]
EFA <- univarEFA$orderframe[thenames,"ROCAUC"]

EFATest <- univarEFATest$orderframe[thenames,"ROCAUC"]
AUCResults$EFA <- EFA
AUCResults$EFA_T <- EFATest
diffAUC$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

thenames <- rownames(univarWhite$orderframe)[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCA <- univarWhite$orderframe[thenames,"ROCAUC"]
thenames <- thenames[WPCA >= aucTHR]
WPCA <- univarWhite$orderframe[thenames,"ROCAUC"]
WPCATest <- univarWhiteTest$orderframe[thenames,"ROCAUC"]
AUCResults$WPCA <- WPCA
AUCResults$WPCA_T <- WPCATest
diffAUC$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.2 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffAUC,
        ylim=c(-0.25,0.25),
        ylab="Test-Train",
        main="Test-Train Paired ROC AUC Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffAUC, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffAUC),lapply(diffAUC,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.3 Distribution of ROC AUC in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(AUCResults)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(AUCResults,
        ylim=c(0.3,1.0),
        ylab="ROC AUC",
        main="ROC AUC of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
abline(h=0.5,col="black")
stripchart(AUCResults, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(AUCResults),lapply(AUCResults,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.4 Differences between train and test Balanced Accuracy

par(op)
par(mfrow=c(1,1),cex=0.7)
BACCTHR <- 0.5
BACCResults <- list();
diffBACC <- list();
thenames <- rownames(univar$orderframe)[(rownames(univar$orderframe) %in% colnames(testDataFrame))]
rawBACC <- univar$orderframe[thenames,"BACC"]
thenames <- thenames[rawBACC >= BACCTHR]
rawBACC <- univar$orderframe[thenames,"BACC"]
rawBACCTest <- univarTest$orderframe[thenames,"BACC"]
BACCResults$RAW <- rawBACCTest
diffBACC$RAW <-  rawBACCTest-rawBACC
plot(rawBACC,rawBACCTest-rawBACC,
     xlab="TRAIN Balanced Acc",
     ylab="Test:BACC-Train:BACC",
     xlim=c(0.5,1.0),
     ylim=c(-0.25,0.25),
     pch=1,
     col=colors[1],
     main="Balanced Acc Difference Between Test and Train")

thenames <- rownames(univarDe$orderframe)[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAP <- univarDe$orderframe[thenames,"BACC"]
thenames <- thenames[IDeAP >= BACCTHR]
IDeAP <- univarDe$orderframe[thenames,"BACC"]
IDeAPTest <- univarDeTest$orderframe[thenames,"BACC"]
BACCResults$IDeAP <- IDeAP
BACCResults$IDeAP_T <- IDeAPTest
diffBACC$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

thenames <- rownames(univarDeSpear$orderframe)[!(rownames(univarDeSpear$orderframe) %in% colnames(testDataFrame))]
IDeAS <- univarDeSpear$orderframe[thenames,"BACC"]
thenames <- thenames[IDeAS >= BACCTHR]
IDeAS <- univarDeSpear$orderframe[thenames,"BACC"]
IDeASTest <- univarDeSpearTest$orderframe[thenames,"BACC"]
BACCResults$IDeAS <- IDeAS
BACCResults$IDeAS_T <- IDeASTest
diffBACC$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

thenames <- rownames(univarDeDri$orderframe)[!(rownames(univarDeDri$orderframe) %in% colnames(testDataFrame))]
DIDeAP <- univarDeDri$orderframe[thenames,"BACC"]
thenames <- thenames[DIDeAP >= BACCTHR]
DIDeAP <- univarDeDri$orderframe[thenames,"BACC"]
DIDeAPTest <- univarDeDriTest$orderframe[thenames,"BACC"]
BACCResults$DIDeAP <- DIDeAP
BACCResults$DIDeAP_T <- DIDeAPTest
diffBACC$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

thenames <- rownames(univarDeDriSpear$orderframe)[!(rownames(univarDeDriSpear$orderframe) %in% colnames(testDataFrame))]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"BACC"]
thenames <- thenames[DIDeAS >= BACCTHR]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"BACC"]
DIDeASTest <- univarDeDriSpearTest$orderframe[thenames,"BACC"]
BACCResults$DIDeAS <- DIDeAS
BACCResults$DIDeAS_T <- DIDeASTest
diffBACC$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

thenames <- rownames(univarPCA$orderframe)[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCA <- univarPCA$orderframe[thenames,"BACC"]
thenames <- thenames[PCA >= BACCTHR]
PCA <- univarPCA$orderframe[thenames,"BACC"]
PCATest <- univarPCATest$orderframe[thenames,"BACC"]
BACCResults$PCA <- PCA
BACCResults$PCA_T <- PCATest
diffBACC$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

thenames <- rownames(univarEFA$orderframe)[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFA <- univarEFA$orderframe[thenames,"BACC"]
thenames <- thenames[EFA >= BACCTHR]
EFA <- univarEFA$orderframe[thenames,"BACC"]

EFATest <- univarEFATest$orderframe[thenames,"BACC"]
BACCResults$EFA <- EFA
BACCResults$EFA_T <- EFATest
diffBACC$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

thenames <- rownames(univarWhite$orderframe)[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCA <- univarWhite$orderframe[thenames,"BACC"]
thenames <- thenames[WPCA >= BACCTHR]
WPCA <- univarWhite$orderframe[thenames,"BACC"]
WPCATest <- univarWhiteTest$orderframe[thenames,"BACC"]
BACCResults$WPCA <- WPCA
BACCResults$WPCA_T <- WPCATest
diffBACC$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.5 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffBACC,
        ylim=c(-0.25,0.25),
        ylab="Test-Train",
        main="Test-Train Paired Balanced Acc Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffBACC, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffBACC),lapply(diffBACC,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.6 Distribution of Balanced Acc in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(BACCResults)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(BACCResults,
        ylim=c(0.3,1.0),
        ylab="Balanced Acc",
        main="Balanced Acc of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
abline(h=0.5,col="black")
stripchart(BACCResults, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(BACCResults),lapply(BACCResults,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.7 Differences between train and test IDI

par(op)
par(mfrow=c(1,1),cex=0.7)

testIDI <- list();
diffIDI <- list();
rawIDI <- univar$orderframe$IDI
rawIDITest <- univarTest$orderframe$IDI
testIDI$RAW <- rawIDITest
diffIDI$RAW <-  rawIDITest-rawIDI
plot(rawIDI,rawIDITest-rawIDI,
     xlab="TRAIN Test IDI",
     ylab="Test:IDI-Train:IDI",
     xlim=c(0,0.5),
     ylim=c(-0.2,0.2),
     pch=1,
     col=colors[1],
     main="Predict IDI Difference Between Test and Train")

IDeAP <- univarDe$orderframe$IDI[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAPTest <-univarDeTest$orderframe$IDI[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
testIDI$IDeAP <- IDeAP
testIDI$IDeAP_T <- IDeAPTest
diffIDI$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

IDeAS <- univarDeSpear$orderframe$IDI[!(rownames(univarDeSpearTest$orderframe) %in% colnames(testDataFrame))]
IDeASTest <- univarDeSpearTest$orderframe$IDI[!(rownames(univarDeSpearTest$orderframe) %in% colnames(testDataFrame))]
testIDI$IDeAS <- IDeAS
testIDI$IDeAS_T <- IDeASTest
diffIDI$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

DIDeAP <- univarDeDri$orderframe$IDI[!(rownames(univarDeDriTest$orderframe) %in% colnames(testDataFrame))]
DIDeAPTest <- univarDeDriTest$orderframe$IDI[!(rownames(univarDeDriTest$orderframe) %in% colnames(testDataFrame))]
testIDI$DIDeAP <- DIDeAP
testIDI$DIDeAP_T <- DIDeAPTest
diffIDI$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

DIDeAS <- univarDeDriSpear$orderframe$IDI[!(rownames(univarDeDriSpearTest$orderframe) %in% colnames(testDataFrame))]
DIDeASTest <- univarDeDriSpearTest$orderframe$IDI[!(rownames(univarDeDriSpearTest$orderframe) %in% colnames(testDataFrame))]
testIDI$DIDeAS <- DIDeAS
testIDI$DIDeAS_T <- DIDeASTest
diffIDI$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

PCA <- univarPCA$orderframe$IDI[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCATest <- univarPCATest$orderframe$IDI[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
testIDI$PCA <- PCA
testIDI$PCA_T <- PCATest
diffIDI$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

EFA <- univarEFA$orderframe$IDI[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFATest <- univarEFATest$orderframe$IDI[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
testIDI$EFA <- EFA
testIDI$EFA_T <- EFATest
diffIDI$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

WPCA <- univarWhite$orderframe$IDI[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCATest <- univarWhiteTest$orderframe$IDI[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
testIDI$WPCA <- WPCA
testIDI$WPCA_T <- WPCATest
diffIDI$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.8 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffIDI,
        ylim=c(-0.2,0.2),
        ylab="Test-Train",
        main="Test-Train Paired Predict IDI Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffIDI, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffIDI),lapply(diffIDI,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.9 Distribution of Predict IDI in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(testIDI)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(testIDI,
        ylim=c(0.0,0.5),
        ylab="Predict IDI",
        main="Predict IDI of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
stripchart(testIDI, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(testIDI),lapply(testIDI,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.10 The tables


pander::pander(univarTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
V_904 0.224 0.125 0.264 0.118 0.896 0.0284 0.596 0.600
V_918 0.231 0.130 0.271 0.120 0.874 0.0257 0.592 0.596
V_916 0.230 0.129 0.270 0.119 0.883 0.0264 0.596 0.596
V_914 0.228 0.128 0.269 0.119 0.890 0.0269 0.596 0.596
V_912 0.227 0.127 0.267 0.118 0.893 0.0273 0.596 0.596
pander::pander(univarDeTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
La_V_1008 1.77e-04 1.94e-03 2.77e-03 1.24e-03 0.602 0.29673 0.742 0.871
La_V_1154 -7.96e-06 1.72e-04 -2.07e-04 1.33e-04 0.932 0.11832 0.637 0.849
La_V_1048 -4.38e-05 4.95e-05 1.11e-05 2.97e-05 0.505 -0.00487 0.533 0.849
La_V_1046 -9.42e-04 3.65e-03 -5.11e-03 3.44e-03 0.721 0.28722 0.787 0.844
La_V_1240 -7.30e-05 2.85e-04 -3.63e-04 2.11e-04 0.267 0.00547 0.483 0.822
pander::pander(univarDeSpearTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
La_V_1052 -1.60e-03 1.49e-03 8.17e-04 1.60e-03 0.393 0.343 0.804 0.911
La_V_1100 -7.37e-03 1.84e-02 1.36e-02 1.65e-02 0.170 0.231 0.750 0.836
La_V_1040 -2.55e-05 7.84e-05 8.28e-05 9.01e-05 0.564 0.224 0.692 0.831
La_V_930 3.22e-05 1.12e-04 1.75e-04 1.38e-04 0.187 0.170 0.650 0.827
La_V_1232 -2.75e-03 2.91e-03 -6.00e-03 3.24e-03 0.853 0.162 0.713 0.822
pander::pander(univarDeDriTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
La_V_1008 1.77e-04 1.94e-03 2.77e-03 1.24e-03 0.602 0.29673 0.742 0.871
La_V_1154 -7.96e-06 1.72e-04 -2.07e-04 1.33e-04 0.932 0.11832 0.637 0.849
La_V_1048 -4.38e-05 4.95e-05 1.11e-05 2.97e-05 0.505 -0.00487 0.533 0.849
La_V_1046 -9.42e-04 3.65e-03 -5.11e-03 3.44e-03 0.721 0.28722 0.787 0.844
La_V_1240 -7.30e-05 2.85e-04 -3.63e-04 2.11e-04 0.267 0.00547 0.483 0.822
pander::pander(univarDeDriSpearTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
La_V_1052 -1.60e-03 1.49e-03 8.17e-04 1.60e-03 0.393 0.343 0.804 0.911
La_V_1100 -7.37e-03 1.84e-02 1.36e-02 1.65e-02 0.170 0.231 0.750 0.836
La_V_1040 -2.55e-05 7.84e-05 8.28e-05 9.01e-05 0.564 0.224 0.692 0.831
La_V_930 3.22e-05 1.12e-04 1.75e-04 1.38e-04 0.187 0.170 0.650 0.827
La_V_1232 -2.75e-03 2.91e-03 -6.00e-03 3.24e-03 0.853 0.162 0.713 0.822
pander::pander(univarPCATest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
RC11 -0.00596 0.0442 0.00596 0.0375 0.832 0.0165 0.608 0.596
RC7 -0.04007 0.5290 0.04007 0.4199 0.962 0.0092 0.567 0.582
RC17 -0.00138 0.0143 0.00138 0.0129 0.901 0.0112 0.571 0.582
RC2 -16.85713 176.4865 16.85713 159.9626 0.838 0.0116 0.562 0.578
RC5 -0.37045 2.6075 0.37045 2.7390 0.859 0.0197 0.575 0.578
pander::pander(univarEFATest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
MR7 -0.00911 0.0134 0.00911 0.0347 0.285 0.07932 0.575 0.684
MR2 -16.86057 176.5327 16.86057 160.0038 0.838 0.01157 0.562 0.578
MR4 -0.36366 2.5412 0.36366 2.6783 0.860 0.01991 0.575 0.578
MR8 -0.03165 0.4469 0.03165 0.3505 0.888 0.00846 0.562 0.578
MR10 -0.00556 0.0453 0.00556 0.0425 0.884 0.01779 0.596 0.578
pander::pander(univarWhiteTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
L17 -0.553 0.992 0.351 0.806 0.921 0.2389 0.729 0.782
L5 0.565 0.805 1.391 1.028 0.462 -0.0320 0.496 0.747
L7 0.665 0.666 0.276 1.243 0.489 0.0692 0.704 0.711
L2 -0.259 1.108 -0.909 0.785 0.999 0.1084 0.688 0.693
L9 1.086 1.191 1.677 0.682 0.746 0.0320 0.608 0.680

topUni <- univar$orderframe$Name[1:TopVariables]
topDe <- univarDe$orderframe$Name[1:TopVariables]
topDeSpear <- univarDeSpear$orderframe$Name[1:TopVariables]
topDeDri <- univarDeDri$orderframe$Name[1:TopVariables]
topDeDriSpear <- univarDeDriSpear$orderframe$Name[1:TopVariables]
topPCA <- univarPCA$orderframe$Name[1:TopVariables]
topEFA <- univarEFA$orderframe$Name[1:TopVariables]
topPCAWhite <- univarWhite$orderframe$Name[1:TopVariables]

1.11.11 Model of top variables

par(mfrow=c(1,2),cex=0.6)

lmRAW <- glm(paste(outcome,"~."),
             trainDataFrame[,c(outcome,topUni)],
             family="binomial")
prRaw <- predictionStats_binary(cbind(testDataFrame[,outcome],predict(lmRAW,testDataFrame)),"Top Raw",cex=0.75)

Top Raw


lmDe <- glm(paste(outcome,"~."),
            DEdataframe[,c(outcome,topDe)],
            family="binomial")
prDe <- predictionStats_binary(cbind(predTestDe[,outcome],predict(lmDe,predTestDe)),"Top IDeA:P",cex=0.75)

Top IDeA:P


lmDeSpear <- glm(paste(outcome,"~."),
            DEdataframeSpear[,c(outcome,topDeSpear)],
            family="binomial")
prSpear <- predictionStats_binary(cbind(predTestDeSpear[,outcome],predict(lmDeSpear,predTestDeSpear)),"Top IDeA:S",cex=0.75)

Top IDeA:S


lmDri <- glm(paste(outcome,"~."),
            DriDEdataframe[,c(outcome,topDeDri)],
            family="binomial")
prDri <- predictionStats_binary(cbind(predTestDe[,outcome],predict(lmDri,predTestDri)),"Top DIDeA:P",cex=0.75)

Top DIDeA:P


lmDriSpear <- glm(paste(outcome,"~."),
            DriDEdataframeSpear[,c(outcome,topDeDriSpear)],
            family="binomial")
prDriSpear <- predictionStats_binary(cbind(predTestDriSpear[,outcome],predict(lmDriSpear,predTestDriSpear)),"Top DIDeA:S",cex=0.7)

Top DIDeA:S



lmPCA <- glm(paste(outcome,"~."),
            PCA_Train[,c(outcome,topPCA)],
            family="binomial")
prPCA <- predictionStats_binary(cbind(PCA_Predicted[,outcome],predict(lmPCA,PCA_Predicted)),"Top PCA",cex=0.75)

Top PCA



lmEFA <- glm(paste(outcome,"~."),
            EFA_Train[,c(outcome,topEFA)],
            family="binomial")
prEFA <- predictionStats_binary(cbind(EFA_Predicted[,outcome],predict(lmEFA,EFA_Predicted)),"Top EFA",cex=0.75)

Top EFA



lmPCAW <- glm(paste(outcome,"~."),
            PCAWhite_Train[,c(outcome,topPCAWhite)],
            family="binomial")
prWPCA <- predictionStats_binary(cbind(PCAWhitePredicted[,outcome],predict(lmPCAW,PCAWhitePredicted)),"Top White:PCA",cex=0.75)

Top White:PCA

par(op)

1.11.12 The Performance Tables and Plots


par(cex=0.6)

 aucs <- prRaw$aucs
  aucs <- rbind(aucs,prDe$aucs)
  aucs <- rbind(aucs,prSpear$aucs)
  aucs <- rbind(aucs,prDri$aucs)
  aucs <- rbind(aucs,prDriSpear$aucs)
  aucs <- rbind(aucs,prPCA$aucs)
  aucs <- rbind(aucs,prEFA$aucs)
  aucs <- rbind(aucs,prWPCA$aucs)

  
  rownames(aucs) <- c("RAW",
                        "IDeA:P",
                        "IDeA:S",
                        "DIDeA:P",
                        "DIDeA:S",
                        "PCA",
                        "EFA",
                        "WPCA"
                        )
  
  pander::pander(aucs)
  est lower upper
RAW 0.771 0.597 0.945
IDeA:P 0.713 0.524 0.901
IDeA:S 0.808 0.644 0.972
DIDeA:P 0.713 0.524 0.901
DIDeA:S 0.808 0.644 0.972
PCA 0.688 0.492 0.883
EFA 0.717 0.530 0.903
WPCA 0.800 0.644 0.956
  
  bpAUC <- barPlotCiError(as.matrix(aucs),
                          metricname = "ROC AUC",
                          thesets = "Test AUC",
                          themethod = rownames(aucs),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomleft",inset=c(0.0,0),cex=0.5),
                          col = terrain.colors(nrow(aucs))
                          )


  
 berror <- prRaw$berror
  berror <- rbind(berror,prDe$berror)
  berror <- rbind(berror,prSpear$berror)
  berror <- rbind(berror,prDri$berror)
  berror <- rbind(berror,prDriSpear$berror)
  berror <- rbind(berror,prPCA$berror)
  berror <- rbind(berror,prEFA$berror)
  berror <- rbind(berror,prWPCA$berror)


  rownames(berror) <- rownames(aucs)
  pander::pander(berror)
  50% 2.5% 97.5%
RAW 0.290 0.132 0.452
IDeA:P 0.355 0.183 0.517
IDeA:S 0.288 0.145 0.450
DIDeA:P 0.352 0.195 0.529
DIDeA:S 0.279 0.136 0.452
PCA 0.349 0.188 0.529
EFA 0.387 0.225 0.548
WPCA 0.325 0.176 0.487

  bpBER <- barPlotCiError(as.matrix(berror),
                          metricname = "Balanced Error Rate",
                          thesets = "Test BER",
                          themethod = rownames(aucs),
                          main = "Balanced Error Rate",
                          offsets = c(0.5,1),
                          scoreDirection = "<",
                          ho=0.5,
                          args.legend = list(bg = "white",x="topleft",inset=c(0.0,0),cex=0.5),
                          col = terrain.colors(nrow(aucs))
                          )

  par(op)